{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn import metrics\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.datasets import load_wine\n",
    "from sklearn.pipeline import make_pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus']=False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "features, target = load_wine(return_X_y=True) # 三分类的酒数据集\n",
    "features.shape, target.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "RANDOM_STATE = 42\n",
    "# 将数据切分成7：3分别作为训练集和测试集\n",
    "X_train, X_test, y_train, y_test = train_test_split(features, target,\n",
    "                                                    test_size=0.30,\n",
    "                                                    random_state=RANDOM_STATE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 不使用PCA\n",
    "raw_clf = make_pipeline(StandardScaler(), LogisticRegression())\n",
    "raw_clf.fit(X_train, y_train)\n",
    "pred_test_raw = raw_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 使用PCA但不做数据预处理\n",
    "unscaled_clf = make_pipeline(PCA(n_components=2), LogisticRegression())\n",
    "unscaled_clf.fit(X_train, y_train)\n",
    "pred_test = unscaled_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 使用PCA，同时做数据预处理\n",
    "std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), LogisticRegression())\n",
    "std_clf.fit(X_train, y_train)\n",
    "pred_test_std = std_clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 查看各情况下的分类准确率\n",
    "print u'\\n不使用PCA，预测准确率:', '{:.2%}'.format(metrics.accuracy_score(y_test, pred_test_raw))\n",
    "print u'\\n使用PCA但不预处理，预测准确率:', '{:.2%}'.format(metrics.accuracy_score(y_test, pred_test))\n",
    "print u'\\n使用PCA且预处理，预测准确率:', '{:.2%}'.format(metrics.accuracy_score(y_test, pred_test_std))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 将pca信息抽取出来\n",
    "pca = unscaled_clf.named_steps['pca']\n",
    "pca_std = std_clf.named_steps['pca']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 打印最主要的主成分。注意，它是特征空间中的主成分轴，表达了数据中具有最大方差的方向\n",
    "print u'\\n未预处理第一主成分:\\n', pca.components_[0]\n",
    "print u'\\n预处理第一主成分:\\n', pca_std.components_[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 对训练数据进行PCA降维以备绘图\n",
    "X_train_nostd = pca.transform(X_train)\n",
    "scaler = std_clf.named_steps['standardscaler']\n",
    "X_train_std = pca_std.transform(scaler.transform(X_train))\n",
    "\n",
    "FIG_SIZE = (10, 7)\n",
    "\n",
    "fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=FIG_SIZE)\n",
    "\n",
    "# 不预处理的PCA\n",
    "for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):\n",
    "    ax1.scatter(X_train_nostd[y_train == l, 0], X_train_nostd[y_train == l, 1],\n",
    "                color=c,\n",
    "                label='class %s' % l,\n",
    "                alpha=0.5,\n",
    "                marker=m\n",
    "                )\n",
    "\n",
    "# 预处理后的PCA\n",
    "for l, c, m in zip(range(0, 3), ('blue', 'red', 'green'), ('^', 's', 'o')):\n",
    "    ax2.scatter(X_train_std[y_train == l, 0], X_train_std[y_train == l, 1],\n",
    "                color=c,\n",
    "                label='class %s' % l,\n",
    "                alpha=0.5,\n",
    "                marker=m\n",
    "                )\n",
    "\n",
    "ax1.set_title(u'PCA降维后的训练集')\n",
    "ax2.set_title(u'特征放缩+PCA降维后的训练集')\n",
    "\n",
    "for ax in (ax1, ax2):\n",
    "    ax.set_xlabel(u'第一主成分')\n",
    "    ax.set_ylabel(u'第二主成分')\n",
    "    ax.legend(loc='upper right')\n",
    "    ax.grid()\n",
    "\n",
    "plt.tight_layout()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
